# visualize structural topic model results

# note: the data necessary to run this code contains sensitive information,
# and is therefore omitted
library(here)

here::here()

source("code/startup.R")
library(stm)

load("data omitted for privacy")

library(ggbeeswarm)

# Collapse topics 4, 10 and 12 into one combined column ("Topic41012") on both
# fitted tables; the sum is taken row by row.
fits_sub[["Topic41012"]] <-
  fits_sub[["Topic4"]] + fits_sub[["Topic10"]] + fits_sub[["Topic12"]]
fits[["Topic41012"]] <-
  fits[["Topic4"]] + fits[["Topic10"]] + fits[["Topic12"]]

# Pull out a random sample of 500 documents with their topic proportions and
# write it to disk. The seed is fixed so the same documents are drawn on every
# run. Topics 4, 10 and 12 are dropped because they are already represented by
# the combined Topic41012 column; all remaining proportions are rounded to three
# decimals and renamed to descriptive labels.
set.seed(11111)
lazy_dt(fits) %>%
  dplyr::select(Candidate.Intake.Form..why,
                starts_with("Topic"),
                -Topic4, -Topic10, -Topic12) %>%
  mutate_at(.vars = dplyr::vars(starts_with("Topic")),
            .funs = function(i) {
              round(i, 3)
            }) %>%
  rename(educators = Topic1,
         progressives = Topic2,
         education = Topic3,
         representation = Topic5,
         political_dynamics = Topic6,
         trump = Topic7,
         previous_interest = Topic8,
         male_identity = Topic9,
         preliminary = Topic11,
         national_issues = Topic13,
         local_dynamics = Topic14,
         populism = Topic15,
         health = Topic16,
         community = Topic17,
         generic_change = Topic41012) %>%
  # seq_len() instead of 1:n() and FALSE instead of the reassignable `F`;
  # the rows drawn for a given seed are unchanged.
  slice(sample(seq_len(n()), 500, replace = FALSE)) %>%
  write.csv(file = "output omitted")
  

# Lookup table for the 15 reported topics: the column name used in the fitted
# tables (`topic`), a display label (`label`) and a running index (`topicnum`).
# "Topic41012" is the combined column built from topics 4, 10 and 12.
topictable <- data.frame(
  topic = paste0("Topic", c(1:3, 5:9, 11, 13:17, 41012)),
  label = c(
    "Educators",
    "Progressives",
    "Education",
    "Representation",
    "Political Dynamics",
    "Trump",
    "Previous Interest",
    "Male Identity",
    "Preliminary",
    "National Issues",
    "Local Dynamics",
    "Populism",
    "Health",
    "Community",
    "Generic Change"
  )
)
topictable[["topicnum"]] <- seq_len(nrow(topictable))

# Distribution of every topic proportion, one panel per value of `running`.
# NOTE(review): columns are picked by position; the axis labels below assume
# the selection yields the 15 reported topic columns in label order plus
# `running` - confirm against the layout of fits_sub.
topicdist_long <- as_tibble(fits_sub) %>%
  select(c(1:3, 5:9, 11, 13:18, 26)) %>%
  group_by(running) %>%
  reshape2::melt(id.vars = "running")

topicdist <- ggplot(topicdist_long, aes(x = variable, y = value)) +
  geom_quasirandom() +
  facet_wrap(
    ~running,
    nrow = 2,
    ncol = 1,
    labeller = as_labeller(c("0" = "Non-Candidates", "1" = "Candidates"))
  ) +
  scale_x_discrete(
    name = "Topic",
    labels = c(
      "Educators", "Progressives", "Education",
      "Representation", "Political Dynamics", "Trump",
      "Previous Interest", "Male Identity",
      "Preliminary", "National Issues",
      "Local Dynamics", "Populism", "Health", "Community",
      "Generic Change"
    )
  ) +
  labs(
    y = "Proportion",
    title = "Distributions of Topic Proportions by Candidate Emergence"
  ) +
  theme_bw() +
  theme(
    text = element_text(family = "Times New Roman", size = 16),
    plot.title = element_text(face = "bold", size = 24),
    plot.subtitle = element_text(size = 16),
    axis.title.x = element_text(size = 20),
    strip.text = element_text(size = 20)
  )

ggsave(
  filename = "figures/topicdist_combine_generic_change.png",
  plot = topicdist,
  width = 14,
  height = 10
)


# Median topic proportion for all respondents, candidates (running == 1) and
# non-candidates (running == 0), drawn as grouped horizontal bars.
# NOTE(review): columns are picked by position; this presumably keeps the 15
# reported topic columns plus `running` - confirm against fits_sub.
topic_props <- as_tibble(fits_sub) %>%
  select(c(1:3, 5:9, 11, 13:17, 26, 18))

# Median of every Topic* column of `df`, tagged with the sample it describes.
summarise_topic_medians <- function(df, sample_label) {
  df %>%
    summarise_at(.vars = dplyr::vars(starts_with("Topic")),
                 .funs = median) %>%
    mutate(sample = sample_label)
}

median_topic_proportions <-
  bind_rows(
    summarise_topic_medians(topic_props, "All Respondents"),
    summarise_topic_medians(topic_props %>% filter(running == 1),
                            "Candidates"),
    summarise_topic_medians(topic_props %>% filter(running == 0),
                            "Non-Candidates")
  ) %>%
  reshape2::melt(id.vars = c("sample")) %>%
  # "All Respondents" sorts first, so fct_inorder() below orders the topics by
  # their median among all respondents.
  arrange(sample, -value) %>%
  left_join(topictable, by = c("variable" = "topic")) %>%
  ggplot(aes(x = fct_rev(fct_inorder(label)),
             y = value, col = "black",
             fill = factor(sample,
                           levels = c("Candidates", "Non-Candidates",
                                      "All Respondents")))) +
  geom_bar(position = "dodge", stat = "identity") +
  coord_flip() +
  scale_color_manual(name = "",
                     breaks = "black",
                     values = "black",
                     labels = "") +
  # guides(<aes> = FALSE) is deprecated in ggplot2; "none" hides the legend.
  guides(colour = "none") +
  scale_fill_grey(name = "",
                  start = 0.1, end = 0.9,
                  breaks = c("All Respondents", "Non-Candidates", "Candidates"),
                  labels = c("All Respondents", "Non-Candidates", "Candidates")) +
  scale_x_discrete(name = "Topic") +
  labs(y = "Median Topic Proportion",
       title = "Topic Prevalence by Candidate Emergence",
       subtitle = "Topics sorted by prevalence among all respondents") +
  theme_bw() +
  theme(text = element_text(family = "Times New Roman", size = 16),
        plot.title = element_text(face = "bold", size = 24),
        plot.subtitle = element_text(size = 16),
        axis.title.x = element_text(size = 20),
        strip.text = element_text(size = 20))
ggsave(filename = "figures/median_topic_proportions.png",
       plot = median_topic_proportions, width = 10, height = 6)

source("code/stm_helpers.R")
set.seed(3333)

# Every contrast below uses the same fitted objects and settings; only the
# covariate and the two values being compared change. Topics 4, 10 and 12 are
# passed as `combine` in every call.
estimate_topic_diff <- function(covariate, value1, value2) {
  plotSTMdiff.flex(rfs_est, covariate = covariate, topics = c(1:17),
                   model = rfs_fit, method = "difference",
                   cov.value1 = value1, cov.value2 = value2,
                   combine = c(4, 10, 12),
                   flex = "data")
}

# The calls are kept in their original order: the helper may consume random
# numbers, in which case the order determines the results under the seed above.
gendiff.p <- estimate_topic_diff("gen.comb.imp", 1, 0)
racdiff.p <- estimate_topic_diff("race.comb", 1, 0)
urdiff.p <- estimate_topic_diff("USR", "U", "R")
sudiff.p <- estimate_topic_diff("USR", "S", "U")
srdiff.p <- estimate_topic_diff("USR", "S", "R")
colldiff.p <- estimate_topic_diff("college.cat", "mc", "mnc")
rundiff.p <- estimate_topic_diff("running", 1, 0)
incdiff.p <- estimate_topic_diff("inc.cat", "over250", "under50")

# Row order applied to the topic tables before plotting.
# NOTE(review): presumably the ordering by median proportion among all
# respondents (see the subtitle of the combined figure) - confirm.
prevalence_order <- c(15, 4, 7, 5, 12, 11, 9, 14, 1, 10, 6, 2, 3, 8, 13)
topictable <- topictable[prevalence_order,]

# Attach the display label of each topic to a table of estimated differences.
# Rows are matched on the topic index, so the row order of `labels` is
# irrelevant here.
add_topic_labels <- function(diff_df, labels = topictable) {
  diff_df %>%
    left_join(labels[, c("topicnum", "label")], by = c("topic" = "topicnum"))
}

# Point-range plot of the estimated difference (mu, with lwr/upr bounds) for
# each topic. Estimates are drawn black when `sig` is 1 and grey when it is 0.
#
# diff_df:         labelled table of differences (see add_topic_labels()).
# title:           plot title.
# y_label:         label of the difference axis.
# show_topic_axis: TRUE keeps the topic names and the "Topic" axis title;
#                  FALSE blanks both (used for right-hand panels of a grid).
# row_order:       row order applied to diff_df before plotting.
plot_topic_diff <- function(diff_df, title, y_label, show_topic_axis = TRUE,
                            row_order = prevalence_order) {
  base_plot <- diff_df[row_order, ] %>%
    ggplot(aes(x = fct_rev(fct_inorder(label)),
               y = mu, ymin = lwr, ymax = upr, col = factor(sig))) +
    geom_pointrange() +
    scale_color_manual(name = "",
                       breaks = c(0, 1),
                       values = c("grey", "black"),
                       labels = c("ns", "s")) +
    # guides(<aes> = FALSE) is deprecated in ggplot2; "none" hides the legend.
    guides(colour = "none") +
    coord_flip() +
    theme_bw() +
    theme(text = element_text(family = "Times New Roman", size = 16),
          plot.title = element_text(face = "bold", size = 20),
          axis.title.x = element_text(vjust = -.5)) +
    ylim(-.1, .1) +
    labs(title = title, y = y_label)

  if (show_topic_axis) {
    base_plot + scale_x_discrete(name = "Topic")
  } else {
    base_plot +
      scale_x_discrete(name = "") +
      theme(axis.text.y = element_blank())
  }
}

# The labelled *.p tables are kept because later code reuses their `label`
# column.
gendiff.p <- add_topic_labels(gendiff.p)
gendiff <- plot_topic_diff(gendiff.p,
                           title = "Gender",
                           y_label = "<<< More Female      More Male >>>")

colldiff.p <- add_topic_labels(colldiff.p)
colldiff <- plot_topic_diff(colldiff.p,
                            title = "Education",
                            y_label = "<<< More Non-College      More College >>>")

racdiff.p <- add_topic_labels(racdiff.p)
racdiff <- plot_topic_diff(racdiff.p,
                           title = "Race",
                           y_label = "<<< More Non-White      More White >>>",
                           show_topic_axis = FALSE)

rundiff.p <- add_topic_labels(rundiff.p)
rundiff <- plot_topic_diff(rundiff.p,
                           title = "Emergence",
                           y_label = "<<< More Not Running      More Running >>>",
                           show_topic_axis = FALSE)


# The urban/rural contrast was estimated (urdiff.p) but no `urdiff` plot was
# ever created, so saving it stopped the script with "object 'urdiff' not
# found". Build it here in the same style as the other difference plots.
# NOTE(review): the title and axis wording assume the covariate values "U" and
# "R" mean urban and rural; positive differences favour the first value ("U").
# Confirm the wording before publishing this figure.
urdiff_labelled <- urdiff.p %>%
  left_join(topictable[, c("topicnum", "label")], by = c("topic" = "topicnum"))

urdiff <- urdiff_labelled[prevalence_order, ] %>%
  ggplot(aes(x = fct_rev(fct_inorder(label)),
             y = mu, ymin = lwr, ymax = upr, col = factor(sig))) +
  geom_pointrange() +
  scale_color_manual(name = "",
                     breaks = c(0, 1),
                     values = c("grey", "black"),
                     labels = c("ns", "s")) +
  guides(colour = "none") +
  coord_flip() +
  theme_bw() +
  theme(text = element_text(family = "Times New Roman", size = 16),
        plot.title = element_text(face = "bold", size = 20),
        axis.title.x = element_text(vjust = -.5)) +
  ylim(-.1, .1) +
  labs(title = "Urban/Rural",
       y = "<<< More Rural      More Urban >>>") +
  scale_x_discrete(name = "Topic")

# Write each single-covariate figure to disk.
ggsave(filename = "figures/gendiff_matched_combined.png", plot = gendiff,
       width = 8, height = 7)
ggsave(filename = "figures/racdiff_matched_combined.png", plot = racdiff,
       width = 8, height = 7)
ggsave(filename = "figures/urdiff_matched_combined.png", plot = urdiff,
       width = 8, height = 7)
ggsave(filename = "figures/colldiff_matched_combined.png", plot = colldiff,
       width = 8, height = 7)
ggsave(filename = "figures/rundiff_matched_combined.png", plot = rundiff,
       width = 8, height = 7)

# Draw a text label styled like one element of a ggplot2 theme.
#
# Args:
#   label:   text to draw.
#   theme:   a ggplot2 theme; when NULL the currently active theme
#            (ggplot2::theme_get()) is used.
#   element: name of the theme element whose family, face, colour and size
#            are applied to the label, e.g. "plot.title".
#   ...:     further arguments passed on to cowplot::draw_label()
#            (position, justification, ...).
#
# Returns whatever cowplot::draw_label() returns. Stops with an error when
# `element` is not among the names of `theme`.
draw_label_theme <- function(label, theme = NULL, element = "text", ...) {
  if (is.null(theme)) {
    theme <- ggplot2::theme_get()
  }
  if (!element %in% names(theme)) {
    stop("Element must be a valid ggplot theme element name")
  }

  elements <- ggplot2::calc_element(element, theme)

  # ggplot2 stores the text colour under the British spelling `colour`.
  # `$color` does not partial-match it and yields NULL, which silently
  # discarded the theme colour.
  cowplot::draw_label(label,
                      fontfamily = elements$family,
                      fontface = elements$face,
                      colour = elements$colour,
                      size = elements$size,
                      ...
  )
}
library(cowplot)
# Theme that supplies the text settings of the shared title and subtitle.
header_theme <- theme_bw() +
  theme(text = element_text(family = "Times New Roman", size = 16),
        plot.title = element_text(face = "bold", size = 20),
        plot.subtitle = element_text(size = 16))

# Title and subtitle are drawn as separate strips so they can sit above the
# 2 x 2 grid of panels.
title <- ggdraw() +
  draw_label_theme("Differences in Expected Topic Proportions by Respondent Characteristics",
                   theme = header_theme,
                   element = "plot.title",
                   x = 0.05, hjust = 0, vjust = 1)
subtitle <- ggdraw() +
  draw_label_theme("Topics sorted by median proportion among all respondents",
                   theme = header_theme,
                   element = "plot.subtitle",
                   x = 0.05, hjust = 0, vjust = 1)

# `rel_widths` (not `widths`) is plot_grid()'s argument for relative column
# widths; `widths` fell into `...` and was treated as an extra plot, so the
# column widths were not applied. The left column carries the topic names and
# therefore gets more room.
topicdiffs <- cowplot::plot_grid(gendiff, racdiff, colldiff, rundiff,
                                 nrow = 2, ncol = 2,
                                 rel_widths = c(1.275, 1))
topicdiffs_wt <-
  plot_grid(title, subtitle, topicdiffs, ncol = 1,
            rel_heights = c(0.06, 0.06, 1))

ggsave(filename = "figures/topicdiffs_matched_combined.png",
       plot = topicdiffs_wt, width = 12, height = 8)


# Row order applied to every labelled difference table below.
# NOTE(review): presumably the ordering by candidate vs. non-candidate
# difference described in the plot subtitle - confirm.
sortorder <- c(5, 11, 3, 13, 14, 8, 2, 4, 12, 1, 10, 6, 15, 7, 9)

# Stack the four contrasts into one long table; `covar` names the contrast.
alldiff.p <- bind_rows(
  rundiff.p[sortorder, ] %>%
    mutate(covar = "Emergence (Non-Candidate to Candidate)"),
  colldiff.p[sortorder, ] %>%
    mutate(covar = "Education (Non-College to College)"),
  gendiff.p[sortorder, ] %>%
    mutate(covar = "Gender (Female to Male)"),
  racdiff.p[sortorder, ] %>%
    mutate(covar = "Race (Non-White to White)")
)
alldiff.p$topic <- factor(alldiff.p$topic)
alldiff.p$label <- factor(alldiff.p$label)
alldiff.p$covar <- factor(alldiff.p$covar)
# Position of each topic within `sortorder`, repeated for the four contrasts.
alldiff.p$topic_order <- rep(seq_along(sortorder), times = 4)

alldiff <- alldiff.p %>%
  # desc() accepts a single vector; desc(topic_order, covar) is an error in
  # current dplyr. Sorting on topic_order alone keeps the contrasts in their
  # stacking order within each topic (arrange() preserves ties), which in turn
  # fixes the level order produced by fct_inorder() below.
  arrange(desc(topic_order)) %>%
  ggplot(aes(x = fct_inorder(label),
             y = mu, ymin = lwr, ymax = upr,
             col = (covar == "Emergence (Non-Candidate to Candidate)"),
             pch = fct_inorder(covar),
             group = fct_inorder(covar))) +
  geom_pointrange(position = position_dodge(width = .7)) +
  # The emergence contrast is drawn in black, the other contrasts in grey.
  scale_color_manual(name = "",
                     breaks = c(FALSE, TRUE),
                     values = c("grey", "black"),
                     labels = c("ns", "s")) +
  scale_shape_manual(name = "",
                     breaks = unique(alldiff.p$covar)[c(4, 3, 2, 1)],
                     values = c(5, 6, 7, 19),
                     labels = unique(alldiff.p$covar)[c(4, 3, 2, 1)]) +
  # guides(<aes> = FALSE) is deprecated in ggplot2; "none" hides the legend.
  guides(colour = "none") +
  coord_flip() +
  ylim(-.1, .1) +
  labs(title = "Differences in Expected Topic Proportions by Respondent Characteristics",
       y = "Expected Difference in Proportion",
       subtitle = "Topics sorted by difference in expected proportion between candidates and non-candidates") +
  scale_x_discrete(name = "Topic") +
  theme_bw() +
  theme(text = element_text(family = "Times New Roman", size = 16),
        plot.title = element_text(face = "bold", size = 20))
ggsave(filename = "figures/topicdiffs_matched_combined_nofacet.png",
       plot = alldiff, width = 12, height = 8)




